# NOTE(review): `_base_` presumably names a parent config whose values the
# sections below override — confirm against the config loader.
_base_: ./pretrain_gpt_base.yaml

# Global run settings: RNG seed and batch sizing.
Global:
  seed: 1234

  global_batch_size: 480
  # Explicitly null (no value is set in this file).
  # NOTE(review): presumably derived by the loader from global_batch_size and
  # the data-parallel degree — confirm.
  local_batch_size: null
  micro_batch_size: 4


# Engine overrides: total step budget, evaluation settings and the
# checkpoint-save interval.
# NOTE(review): eval_freq and save_steps are presumably counted in training
# steps, and eval_iters in evaluation batches — confirm against the engine
# that consumes this config.
Engine:
  max_steps: 200000
  eval_freq: 1000
  eval_iters: 10
  save_load:
    save_steps: 500


# Model architecture and recompute overrides.
# hidden_size / num_attention_heads = 5120 / 40 = 128 per attention head.
Model:
  vocab_size: 50432
  hidden_size: 5120
  num_layers: 40
  num_attention_heads: 40
  # Explicitly null (no value is set in this file).
  # NOTE(review): presumably the model code falls back to a default derived
  # from hidden_size — confirm.
  ffn_hidden_size: null
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  # Same value as Data.Train/Eval.dataset.max_seq_len in this file.
  max_position_embeddings: 4096
  type_vocab_size: 16
  initializer_range: 0.02
  use_recompute: true
  recompute_granularity: 'full'
  # Explicitly null — this is NOT an empty list.
  # NOTE(review): confirm the consumer accepts null here; use `[]` if it
  # requires a list.
  no_recompute_layers: null


# Dataset overrides: the training and evaluation datasets use the same
# max_seq_len (4096), which equals Model.max_position_embeddings in this file.
Data:
  Train:
    dataset:
      max_seq_len: 4096

  Eval:
    dataset:
      max_seq_len: 4096


# Parallelism layout.
# NOTE(review): mp_degree * pp_degree = 2 * 8 = 16, presumably the number of
# devices per data-parallel replica — confirm.
Distributed:
  # Explicitly null (no value is set in this file).
  # NOTE(review): presumably inferred at launch from the device count and the
  # other degrees — confirm.
  dp_degree: null
  mp_degree: 2
  pp_degree: 8
  sharding:
    sharding_degree: 1
    sharding_stage: 1
    sharding_offload: false
    reduce_overlap: false
    broadcast_overlap: false


# Learning-rate schedule overrides.
# decay_steps (160000) is smaller than Engine.max_steps (200000) in this file,
# and max_lr is 10x min_lr.
# NOTE(review): warmup_rate is presumably a fraction of decay_steps spent on
# warmup — confirm against the scheduler implementation.
Optimizer:
  lr:
    name: CosineAnnealingWithWarmupDecay
    decay_steps: 160000
    warmup_rate: 0.001
    # Keep the decimal point and the signed exponent: YAML 1.1 loaders such as
    # PyYAML only resolve exponent notation as a float in that form.
    max_lr: 1.0e-4
    min_lr: 1.0e-5
